library(MASS)
library(DT)
library(car)
# Load the house sales data set from the project's data directory.
house_data <- read.csv("data/kc_house_data.csv")
# Browse the raw data in an interactive table with per-column filters and
# reorderable columns.
DT::datatable(
  house_data,
  class = "stripe cell-border",
  filter = "top",
  extensions = "ColReorder",
  # `autoWidth` used to be listed twice in this list; one entry is enough.
  options = list(autoWidth = TRUE, dom = "Rlfrtip")
)
It seems your data is too big for client-side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/server.html
# Simple linear regression of price on sqft_above.
# Fitted slope: 268.5, intercept: 59953.2
house_lm1 <- lm(formula = price ~ sqft_above, data = house_data)
print(house_lm1)
Call:
lm(formula = price ~ sqft_above, data = house_data)
Coefficients:
(Intercept) sqft_above
59953.2 268.5
# Summary: R-squared is 0.3667; the p-values of the intercept and the slope
# are both below 2e-16.
summary(house_lm1)
Call:
lm(formula = price ~ sqft_above, data = house_data)
Residuals:
Min 1Q Median 3Q Max
-913132 -165624 -41468 109327 5339232
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59953.2 4729.8 12.68 <2e-16 ***
sqft_above 268.5 2.4 111.87 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 292200 on 21611 degrees of freedom
Multiple R-squared: 0.3667, Adjusted R-squared: 0.3667
F-statistic: 1.251e+04 on 1 and 21611 DF, p-value: < 2.2e-16
要約で大切な箇所は、Pr(>|t|) (p値)とMultiple R-squared(決定係数)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59953.2 4729.8 12.68 <2e-16 ***
sqft_above 268.5 2.4 111.87 <2e-16 ***
Multiple R-squared: 0.3667, Adjusted R-squared: 0.3667
誤差(ε)は正規分布に従うと仮定していたので、実際の残差が正規分布に従っているかを確認 正規分布にしたがっていないならば、仮定が崩れる。
# Histogram of the residuals of house_lm1 in bins of width 1e+05.
hist(
  house_lm1$residuals,
  breaks = seq(from = -1e+06, to = 5.4e+06, by = 1e+05)
)
標準正規分布に変換した場合、どの位離れているか?を確認
## Q-Q plot
# Compare the residuals of house_lm1 against normal quantiles and add a red
# reference line.
qqnorm(house_lm1$residuals)
qqline(house_lm1$residuals, col="red")
正規性が保たれていないので、価格の対数でモデルを作成してみる。
# Refit the simple regression with log(price) as the response and show its
# summary.
house_lm2 <- lm(formula = log(price) ~ sqft_above, data = house_data)
print(summary(house_lm2))
Call:
lm(formula = log(price) ~ sqft_above, data = house_data)
Residuals:
Min 1Q Median 3Q Max
-1.39738 -0.31422 -0.00033 0.28155 1.80559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.236e+01 6.810e-03 1815.5 <2e-16 ***
sqft_above 3.828e-04 3.455e-06 110.8 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4206 on 21611 degrees of freedom
Multiple R-squared: 0.3622, Adjusted R-squared: 0.3621
F-statistic: 1.227e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Histogram of the residuals of house_lm2 in bins of width 0.05 over [-2, 2].
hist(house_lm2$residuals,breaks=seq(-2, 2, 0.05))
# Normal Q-Q plot of the same residuals with a red reference line.
qqnorm(house_lm2$residuals)
qqline(house_lm2$residuals, col="red")
誤差の正規性は良さそう。 #クックの距離 外れ値があるかを調べる
# Keep a working copy of the log-price model, compute Cook's distance for
# every observation and show the observation(s) that attain the maximum.
house_lm2_wip <- house_lm2
ck_dist <- cooks.distance(model = house_lm2_wip)
print(ck_dist[ck_dist == max(ck_dist)])
12778
0.01984297
12778 はクックの距離が最大となった観測の行番号で、その距離(最大値)は 0.01984297。描画してみる。
# Draw the lm diagnostic plots for the working model.
plot(house_lm2_wip)
クック距離が、4/データ数より大きいデータ点はインフルエンスが大きいと言われている。 上図だと(1623、18595、12778)。外れ値を除外して、モデルを再構築
# Drop the three observations flagged in the diagnostic plots and refit the
# log-price model on the remaining rows.
house_data_wip <- house_data[-c(1623, 18595, 12778), ]
house_lm2_wip <- lm(log(price) ~ sqft_above, data = house_data_wip)
# Recompute Cook's distance for the refitted model. The result used to be
# stored under the misspelled name `ckck_dist`, so the lookup below kept
# reading the distances of the previous model; re-run to refresh the output.
ck_dist <- cooks.distance(house_lm2_wip)
ck_dist[ck_dist == max(ck_dist)]
12778
0.01984297
# Summary of the model refitted without the three dropped observations.
summary(house_lm2_wip)
Call:
lm(formula = log(price) ~ sqft_above, data = house_data_wip)
Residuals:
Min 1Q Median 3Q Max
-1.3961 -0.3142 -0.0004 0.2817 1.8050
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.236e+01 6.823e-03 1811.6 <2e-16 ***
sqft_above 3.842e-04 3.465e-06 110.9 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4204 on 21608 degrees of freedom
Multiple R-squared: 0.3626, Adjusted R-squared: 0.3625
F-statistic: 1.229e+04 on 1 and 21608 DF, p-value: < 2.2e-16
# Diagnostic plots for the refitted model.
plot(house_lm2_wip)
今回は、外れ値を除くと、決定係数もクックの距離も大きくなってしまったので、外さない方が良い。
waterfront, view, condition, grade を as.factor で因子型に変換する。
# Convert the coded columns to factors in one pass so that lm() fits one
# coefficient per level instead of a single numeric slope.
factor_cols <- c("waterfront", "view", "condition", "grade")
house_data[factor_cols] <- lapply(house_data[factor_cols], as.factor)
id,date,zipcode,lat,long,yr_built,yr_renovated を除いて重回帰分析 説明変数を増やせば、決定係数が上がるので、説明変数を増やすことへのペナルティーを加えた上で分析 分析結果は、数学的に証明されているAICで評価(592752.6)
# Multiple regression of price on every column except id, date, zipcode,
# lat, long, yr_built and yr_renovated, followed by the model's AIC.
house_lm3 <- lm(
  formula = price ~ . - id - date - zipcode - lat - long - yr_built -
    yr_renovated,
  data = house_data
)
print(AIC(house_lm3))
[1] 592752.6
一方、自由度調整済み決定係数(Adjusted R-squared: 0.6467)は、数学的な根拠というより経験的に提案された指標。
# Coefficient table and fit statistics of the multiple regression on price.
summary(house_lm3)
Call:
lm(formula = price ~ . - id - date - zipcode - lat - long - yr_built -
yr_renovated, data = house_data)
Residuals:
Min 1Q Median 3Q Max
-1629728 -117831 -18516 90412 4229913
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.789e+04 2.183e+05 0.311 0.755857
bedrooms -1.746e+04 2.085e+03 -8.374 < 2e-16 ***
bathrooms -2.431e+03 3.343e+03 -0.727 0.467052
sqft_living 1.876e+02 4.662e+00 40.230 < 2e-16 ***
sqft_lot 2.075e-02 5.186e-02 0.400 0.689052
floors 2.254e+04 3.860e+03 5.838 5.36e-09 ***
waterfront1 5.203e+05 2.135e+04 24.367 < 2e-16 ***
view1 1.432e+05 1.221e+04 11.733 < 2e-16 ***
view2 8.365e+04 7.388e+03 11.322 < 2e-16 ***
view3 1.353e+05 1.015e+04 13.324 < 2e-16 ***
view4 2.769e+05 1.567e+04 17.673 < 2e-16 ***
condition2 -1.267e+04 4.396e+04 -0.288 0.773148
condition3 -1.912e+04 4.090e+04 -0.468 0.640121
condition4 3.152e+04 4.093e+04 0.770 0.441150
condition5 1.071e+05 4.117e+04 2.601 0.009291 **
grade3 2.751e+04 2.553e+05 0.108 0.914193
grade4 3.835e+04 2.255e+05 0.170 0.864945
grade5 2.267e+04 2.222e+05 0.102 0.918741
grade6 5.815e+04 2.221e+05 0.262 0.793445
grade7 9.320e+04 2.221e+05 0.420 0.674761
grade8 1.555e+05 2.221e+05 0.700 0.483866
grade9 2.874e+05 2.222e+05 1.294 0.195850
grade10 4.741e+05 2.223e+05 2.132 0.032979 *
grade11 7.454e+05 2.226e+05 3.348 0.000814 ***
grade12 1.209e+06 2.237e+05 5.404 6.60e-08 ***
grade13 2.498e+06 2.310e+05 10.813 < 2e-16 ***
sqft_above -5.868e+01 4.644e+00 -12.636 < 2e-16 ***
sqft_basement NA NA NA NA
sqft_living15 1.839e+01 3.669e+00 5.013 5.41e-07 ***
sqft_lot15 -7.014e-01 7.906e-02 -8.872 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 218200 on 21584 degrees of freedom
Multiple R-squared: 0.6472, Adjusted R-squared: 0.6467
F-statistic: 1414 on 28 and 21584 DF, p-value: < 2.2e-16
# Histogram of the residuals of house_lm3 in bins of width 5e+04.
hist(house_lm3$residuals, breaks=seq(-1.7e+06,6.0e+06,5e+04))
## qqプロット
# Normal Q-Q plot of the residuals of house_lm3 with a red reference line.
qqnorm(house_lm3$residuals)
qqline(house_lm3$residuals, col="red")
正規性がないので、価格に対数( log(price) )を取ってみる。
# Same predictor set as house_lm3, but with log(price) as the response.
house_lm4 <- lm(
  formula = log(price) ~ . - id - date - zipcode - lat - long - yr_built -
    yr_renovated,
  data = house_data
)
print(summary(house_lm4))
Call:
lm(formula = log(price) ~ . - id - date - zipcode - lat - long -
yr_built - yr_renovated, data = house_data)
Residuals:
Min 1Q Median 3Q Max
-1.47369 -0.23660 0.01019 0.22325 1.33856
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.162e+01 3.318e-01 35.030 < 2e-16 ***
bedrooms -1.626e-02 3.168e-03 -5.132 2.88e-07 ***
bathrooms -1.585e-02 5.080e-03 -3.120 0.001814 **
sqft_living 2.618e-04 7.085e-06 36.953 < 2e-16 ***
sqft_lot 2.853e-07 7.881e-08 3.621 0.000295 ***
floors 7.108e-02 5.866e-03 12.116 < 2e-16 ***
waterfront1 3.671e-01 3.245e-02 11.315 < 2e-16 ***
view1 2.037e-01 1.855e-02 10.978 < 2e-16 ***
view2 1.349e-01 1.123e-02 12.013 < 2e-16 ***
view3 1.567e-01 1.543e-02 10.159 < 2e-16 ***
view4 2.648e-01 2.381e-02 11.121 < 2e-16 ***
condition2 -4.816e-02 6.680e-02 -0.721 0.470924
condition3 5.527e-02 6.214e-02 0.889 0.373824
condition4 1.338e-01 6.219e-02 2.151 0.031468 *
condition5 2.674e-01 6.256e-02 4.274 1.93e-05 ***
grade3 1.438e-01 3.880e-01 0.371 0.710829
grade4 2.210e-01 3.426e-01 0.645 0.518868
grade5 3.019e-01 3.377e-01 0.894 0.371286
grade6 4.932e-01 3.375e-01 1.461 0.143912
grade7 6.818e-01 3.375e-01 2.020 0.043378 *
grade8 8.589e-01 3.375e-01 2.544 0.010951 *
grade9 1.061e+00 3.377e-01 3.141 0.001685 **
grade10 1.224e+00 3.378e-01 3.623 0.000291 ***
grade11 1.347e+00 3.383e-01 3.982 6.85e-05 ***
grade12 1.451e+00 3.399e-01 4.268 1.98e-05 ***
grade13 1.674e+00 3.510e-01 4.768 1.87e-06 ***
sqft_above -1.203e-04 7.057e-06 -17.047 < 2e-16 ***
sqft_basement NA NA NA NA
sqft_living15 8.770e-05 5.575e-06 15.731 < 2e-16 ***
sqft_lot15 -8.186e-07 1.201e-07 -6.814 9.75e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3316 on 21584 degrees of freedom
Multiple R-squared: 0.6041, Adjusted R-squared: 0.6036
F-statistic: 1176 on 28 and 21584 DF, p-value: < 2.2e-16
# Histogram of the residuals of house_lm4 in bins of width 0.05.
hist(house_lm4$residuals, breaks=seq(-1.6, 1.6, 0.05))
# Normal Q-Q plot of the same residuals with a red reference line.
qqnorm(house_lm4$residuals)
qqline(house_lm4$residuals, col="red")
正規性は保たれたが、summaryを見ると bedrooms -1.626e-02 bathrooms -1.585e-02 sqft_above -1.203e-04 の回帰係数(傾き)がマイナスになっている。これは、ベッド数が多くなると、価格が下がるという事であり、モデルがおかしそうだ。
説明変数間で相関が強くなっていないか?を確認
# Variance inflation factors to check for multicollinearity. In the recorded
# run this call stops with an error because the model has aliased coefficients.
vif(house_lm4)
Error in vif.default(house_lm4) :
there are aliased coefficients in the model
there are aliased coefficients in the model
とは完全に相関してる項目が存在しているというエラー。 何が、相関しているかを確認
# Refit the same formula as house_lm4 and list which terms are exact linear
# combinations of the others.
alias(lm(log(price)~.-id-date-zipcode-lat-long-yr_built-yr_renovated, data=house_data))
Model :
log(price) ~ (id + date + bedrooms + bathrooms + sqft_living +
sqft_lot + floors + waterfront + view + condition + grade +
sqft_above + sqft_basement + yr_built + yr_renovated + zipcode +
lat + long + sqft_living15 + sqft_lot15) - id - date - zipcode -
lat - long - yr_built - yr_renovated
Complete :
(Intercept) bedrooms bathrooms sqft_living sqft_lot floors waterfront1 view1 view2 view3
sqft_basement 0 0 0 1 0 0 0 0 0 0
view4 condition2 condition3 condition4 condition5 grade3 grade4 grade5 grade6 grade7
sqft_basement 0 0 0 0 0 0 0 0 0 0
grade8 grade9 grade10 grade11 grade12 grade13 sqft_above sqft_living15 sqft_lot15
sqft_basement 0 0 0 0 0 0 -1 0 0
alias の結果から、sqft_basement は sqft_living − sqft_above と完全に一致する(説明変数間に線形従属の関係がある)ので、sqft_basementを削除した上で、多重共線性(マルチコ)を確認 一般に、VIFが5以上で要注意。10以上でマルチコありとなる
# Refit the log-price model without sqft_basement and compute the variance
# inflation factors again.
house_lm4 <- lm(
  formula = log(price) ~ . - id - date - zipcode - lat - long - yr_built -
    yr_renovated - sqft_basement,
  data = house_data
)
print(vif(house_lm4))
GVIF Df GVIF^(1/(2*Df))
bedrooms 1.706847 1 1.306464
bathrooms 3.008411 1 1.734477
sqft_living 8.322593 1 2.884890
sqft_lot 2.094488 1 1.447235
floors 1.972327 1 1.404396
waterfront 1.548928 1 1.244559
view 1.818152 4 1.077591
condition 1.235554 4 1.026793
grade 4.263892 11 1.068138
sqft_above 6.712648 1 2.590878
sqft_living15 2.870156 1 1.694153
sqft_lot15 2.114837 1 1.454248
AICを再確認
# AIC of the refitted log-price model.
AIC(house_lm4)
[1] 13650.99
13650.99 < 592752.6 (AICは小さくなっている。ただし、592752.6 は目的変数が price の house_lm3 の値であり、目的変数が log(price) の house_lm4 とは尺度が異なるため、この差だけでモデルが良くなったとは判断できない) 続いて、サマリーを再確認
# Summary of the log-price model after removing sqft_basement.
summary(house_lm4)
Call:
lm(formula = log(price) ~ . - id - date - zipcode - lat - long -
yr_built - yr_renovated - sqft_basement, data = house_data)
Residuals:
Min 1Q Median 3Q Max
-1.47369 -0.23660 0.01019 0.22325 1.33856
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.162e+01 3.318e-01 35.030 < 2e-16 ***
bedrooms -1.626e-02 3.168e-03 -5.132 2.88e-07 ***
bathrooms -1.585e-02 5.080e-03 -3.120 0.001814 **
sqft_living 2.618e-04 7.085e-06 36.953 < 2e-16 ***
sqft_lot 2.853e-07 7.881e-08 3.621 0.000295 ***
floors 7.108e-02 5.866e-03 12.116 < 2e-16 ***
waterfront1 3.671e-01 3.245e-02 11.315 < 2e-16 ***
view1 2.037e-01 1.855e-02 10.978 < 2e-16 ***
view2 1.349e-01 1.123e-02 12.013 < 2e-16 ***
view3 1.567e-01 1.543e-02 10.159 < 2e-16 ***
view4 2.648e-01 2.381e-02 11.121 < 2e-16 ***
condition2 -4.816e-02 6.680e-02 -0.721 0.470924
condition3 5.527e-02 6.214e-02 0.889 0.373824
condition4 1.338e-01 6.219e-02 2.151 0.031468 *
condition5 2.674e-01 6.256e-02 4.274 1.93e-05 ***
grade3 1.438e-01 3.880e-01 0.371 0.710829
grade4 2.210e-01 3.426e-01 0.645 0.518868
grade5 3.019e-01 3.377e-01 0.894 0.371286
grade6 4.932e-01 3.375e-01 1.461 0.143912
grade7 6.818e-01 3.375e-01 2.020 0.043378 *
grade8 8.589e-01 3.375e-01 2.544 0.010951 *
grade9 1.061e+00 3.377e-01 3.141 0.001685 **
grade10 1.224e+00 3.378e-01 3.623 0.000291 ***
grade11 1.347e+00 3.383e-01 3.982 6.85e-05 ***
grade12 1.451e+00 3.399e-01 4.268 1.98e-05 ***
grade13 1.674e+00 3.510e-01 4.768 1.87e-06 ***
sqft_above -1.203e-04 7.057e-06 -17.047 < 2e-16 ***
sqft_living15 8.770e-05 5.575e-06 15.731 < 2e-16 ***
sqft_lot15 -8.186e-07 1.201e-07 -6.814 9.75e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3316 on 21584 degrees of freedom
Multiple R-squared: 0.6041, Adjusted R-squared: 0.6036
F-statistic: 1176 on 28 and 21584 DF, p-value: < 2.2e-16
マルチコを解消したが、p値で棄却できない項目(condition、grade)を説明変数から外す。
# Log-price model with condition and grade additionally removed from the
# predictors, followed by its summary.
house_lm5 <- lm(
  formula = log(price) ~ . - id - date - zipcode - lat - long - yr_built -
    yr_renovated - sqft_basement - condition - grade,
  data = house_data
)
print(summary(house_lm5))
Call:
lm(formula = log(price) ~ . - id - date - zipcode - lat - long -
yr_built - yr_renovated - sqft_basement - condition - grade,
data = house_data)
Residuals:
Min 1Q Median 3Q Max
-2.45489 -0.26683 0.01249 0.24625 1.60821
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.207e+01 1.205e-02 1001.212 < 2e-16 ***
bedrooms -3.423e-02 3.293e-03 -10.393 < 2e-16 ***
bathrooms 1.990e-02 5.320e-03 3.741 0.000184 ***
sqft_living 3.478e-04 7.383e-06 47.112 < 2e-16 ***
sqft_lot 2.687e-07 8.466e-08 3.173 0.001508 **
floors 1.044e-01 6.026e-03 17.326 < 2e-16 ***
waterfront1 3.213e-01 3.485e-02 9.220 < 2e-16 ***
view1 2.175e-01 1.994e-02 10.905 < 2e-16 ***
view2 1.647e-01 1.205e-02 13.664 < 2e-16 ***
view3 1.950e-01 1.656e-02 11.780 < 2e-16 ***
view4 3.314e-01 2.547e-02 13.010 < 2e-16 ***
sqft_above -9.795e-05 7.393e-06 -13.250 < 2e-16 ***
sqft_living15 1.682e-04 5.669e-06 29.674 < 2e-16 ***
sqft_lot15 -1.035e-06 1.291e-07 -8.017 1.14e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3567 on 21599 degrees of freedom
Multiple R-squared: 0.5417, Adjusted R-squared: 0.5414
F-statistic: 1964 on 13 and 21599 DF, p-value: < 2.2e-16
# AIC of the reduced log-price model.
AIC(house_lm5)
[1] 16786.6
# Variance inflation factors of the reduced log-price model.
vif(house_lm5)
GVIF Df GVIF^(1/(2*Df))
bedrooms 1.594029 1 1.262549
bathrooms 2.852491 1 1.688932
sqft_living 7.811808 1 2.794961
sqft_lot 2.089329 1 1.445451
floors 1.798878 1 1.341223
waterfront 1.544181 1 1.242651
view 1.784447 4 1.075073
sqft_above 6.367104 1 2.523312
sqft_living15 2.565055 1 1.601579
sqft_lot15 2.109817 1 1.452521
AIC(16786.6)は大きくなり、bedroomsとsqft_aboveの説明がつかないが全体的にはよくなった。
stepとAICではAICの値が変わるが、step関数内で使用しているAICは定数項を除いたAIC(extractAIC)。 どちらでも意味合いとしては同じ。
# Stepwise model selection by AIC, starting from the full price model
# house_lm3; the selected model is stored in house_lm6.
house_lm6<-step(house_lm3)
Start: AIC=531415.5
price ~ (id + date + bedrooms + bathrooms + sqft_living + sqft_lot +
floors + waterfront + view + condition + grade + sqft_above +
sqft_basement + yr_built + yr_renovated + zipcode + lat +
long + sqft_living15 + sqft_lot15) - id - date - zipcode -
lat - long - yr_built - yr_renovated
Step: AIC=531415.5
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
waterfront + view + condition + grade + sqft_above + sqft_living15 +
sqft_lot15
Df Sum of Sq RSS AIC
- sqft_lot 1 7.6241e+09 1.0277e+15 531414
- bathrooms 1 2.5187e+10 1.0278e+15 531414
<none> 1.0277e+15 531416
- sqft_living15 1 1.1964e+12 1.0289e+15 531439
- floors 1 1.6228e+12 1.0294e+15 531448
- bedrooms 1 3.3391e+12 1.0311e+15 531484
- sqft_lot15 1 3.7476e+12 1.0315e+15 531492
- sqft_above 1 7.6023e+12 1.0353e+15 531573
- condition 4 2.7015e+13 1.0548e+15 531968
- waterfront 1 2.8272e+13 1.0560e+15 532000
- view 4 2.9709e+13 1.0574e+15 532023
- sqft_living 1 7.7063e+13 1.1048e+15 532976
- grade 11 2.0609e+14 1.2338e+15 535344
Step: AIC=531413.7
price ~ bedrooms + bathrooms + sqft_living + floors + waterfront +
view + condition + grade + sqft_above + sqft_living15 + sqft_lot15
Df Sum of Sq RSS AIC
- bathrooms 1 2.5070e+10 1.0278e+15 531412
<none> 1.0277e+15 531414
- sqft_living15 1 1.1895e+12 1.0289e+15 531437
- floors 1 1.6167e+12 1.0294e+15 531446
- bedrooms 1 3.3470e+12 1.0311e+15 531482
- sqft_lot15 1 6.9010e+12 1.0346e+15 531556
- sqft_above 1 7.5972e+12 1.0353e+15 531571
- condition 4 2.7014e+13 1.0548e+15 531966
- waterfront 1 2.8266e+13 1.0560e+15 531998
- view 4 2.9742e+13 1.0575e+15 532022
- sqft_living 1 7.7093e+13 1.1048e+15 532975
- grade 11 2.0610e+14 1.2338e+15 535342
Step: AIC=531412.2
price ~ bedrooms + sqft_living + floors + waterfront + view +
condition + grade + sqft_above + sqft_living15 + sqft_lot15
Df Sum of Sq RSS AIC
<none> 1.0278e+15 531412
- sqft_living15 1 1.1933e+12 1.0290e+15 531435
- floors 1 1.6571e+12 1.0294e+15 531445
- bedrooms 1 3.5473e+12 1.0313e+15 531485
- sqft_lot15 1 6.8930e+12 1.0347e+15 531555
- sqft_above 1 7.5973e+12 1.0354e+15 531569
- condition 4 2.7131e+13 1.0549e+15 531967
- waterfront 1 2.8261e+13 1.0560e+15 531996
- view 4 2.9777e+13 1.0576e+15 532021
- sqft_living 1 8.9610e+13 1.1174e+15 533217
- grade 11 2.0637e+14 1.2341e+15 535345
# Summary of the model selected by step().
summary(house_lm6)
Call:
lm(formula = price ~ bedrooms + sqft_living + floors + waterfront +
view + condition + grade + sqft_above + sqft_living15 + sqft_lot15,
data = house_data)
Residuals:
Min 1Q Median 3Q Max
-1632940 -117737 -18626 90275 4233342
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.907e+04 2.183e+05 0.316 0.751734
bedrooms -1.773e+04 2.054e+03 -8.631 < 2e-16 ***
sqft_living 1.863e+02 4.294e+00 43.383 < 2e-16 ***
floors 2.160e+04 3.661e+03 5.899 3.70e-09 ***
waterfront1 5.202e+05 2.135e+04 24.363 < 2e-16 ***
view1 1.433e+05 1.221e+04 11.738 < 2e-16 ***
view2 8.378e+04 7.386e+03 11.343 < 2e-16 ***
view3 1.354e+05 1.014e+04 13.352 < 2e-16 ***
view4 2.770e+05 1.567e+04 17.683 < 2e-16 ***
condition2 -1.283e+04 4.396e+04 -0.292 0.770318
condition3 -1.973e+04 4.089e+04 -0.483 0.629354
condition4 3.114e+04 4.092e+04 0.761 0.446728
condition5 1.065e+05 4.116e+04 2.588 0.009658 **
grade3 2.791e+04 2.553e+05 0.109 0.912958
grade4 3.740e+04 2.254e+05 0.166 0.868230
grade5 2.196e+04 2.222e+05 0.099 0.921278
grade6 5.735e+04 2.221e+05 0.258 0.796204
grade7 9.181e+04 2.221e+05 0.413 0.679291
grade8 1.538e+05 2.221e+05 0.692 0.488769
grade9 2.858e+05 2.222e+05 1.286 0.198417
grade10 4.723e+05 2.223e+05 2.125 0.033614 *
grade11 7.434e+05 2.226e+05 3.340 0.000839 ***
grade12 1.207e+06 2.237e+05 5.395 6.91e-08 ***
grade13 2.495e+06 2.309e+05 10.802 < 2e-16 ***
sqft_above -5.820e+01 4.608e+00 -12.632 < 2e-16 ***
sqft_living15 1.834e+01 3.663e+00 5.006 5.60e-07 ***
sqft_lot15 -6.788e-01 5.641e-02 -12.032 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 218200 on 21586 degrees of freedom
Multiple R-squared: 0.6472, Adjusted R-squared: 0.6467
F-statistic: 1523 on 26 and 21586 DF, p-value: < 2.2e-16
# AIC of the model selected by step().
AIC(house_lm6)
[1] 592749.2
単純にする為、price-sqft_aboveの単回帰分析を考える
# Simple regression of price on sqft_above. Note that this reassigns
# house_lm6, replacing the model selected by step().
house_lm6 <- lm(formula = price ~ sqft_above, data = house_data)
print(house_lm6)
Call:
lm(formula = price ~ sqft_above, data = house_data)
Coefficients:
(Intercept) sqft_above
59953.2 268.5
# Scatter plot of price against sqft_above with the fitted line in red.
plot(house_data$sqft_above, house_data$price)
abline(house_lm6, col = "red")

# Point predictions of price for four new sqft_above values.
new_data <- data.frame(sqft_above = seq(from = 2000, to = 8000, by = 2000))
new_data
predict(object = house_lm6, newdata = new_data)
1 2 3 4
596899.6 1133845.9 1670792.3 2207738.7
# Confidence intervals for the mean price at the new sqft_above values.
predict(house_lm6, newdata = new_data, interval = "confidence")
# Prediction intervals for individual prices at the same values.
predict(house_lm6, newdata = new_data, interval = "prediction")

# Scatter plot with the fitted line, using wider axis limits so the interval
# bands drawn below are visible across the whole grid.
plot(
  house_data$sqft_above, house_data$price,
  xlim = c(-1000, 12000), ylim = c(-1000, 4e+6)
)
abline(house_lm6, col = "red")

# Confidence band (blue, dashed) over a grid of sqft_above values.
sqft_above_seq <- -1000:12000
new_data2 <- data.frame(sqft_above = sqft_above_seq)
conf_interval <- predict(
  house_lm6,
  newdata = new_data2, interval = "confidence"
)
lines(sqft_above_seq, conf_interval[, "lwr"], col = "blue", lty = 2)
lines(sqft_above_seq, conf_interval[, "upr"], col = "blue", lty = 2)

# Prediction band (green, dashed) from the same model as the fitted line.
pred_interval <- predict(
  house_lm6,
  newdata = new_data2, interval = "prediction"
)
lines(sqft_above_seq, pred_interval[, "lwr"], col = "green", lty = 2)
lines(sqft_above_seq, pred_interval[, "upr"], col = "green", lty = 2)
# Randomly assign 70% of the rows to the training set and the rest to the
# test set.
# NOTE(review): call set.seed() before this split if the reported error
# needs to be reproducible.
n_obs <- nrow(house_data)
train_idx <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- house_data[train_idx, ]
test <- house_data[-train_idx, ]

# Fit the price model on the training rows only.
mymodel <- lm(
  price ~ bedrooms + floors + waterfront + view + condition + yr_built +
    yr_renovated + sqft_living15 + sqft_lot15,
  data = train
)

# Evaluate on the held-out rows: mean squared error and its square root.
ypred <- predict(mymodel, newdata = test)
mse <- mean((test$price - ypred)^2)
mse
rmse <- sqrt(mse)
rmse